import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Load the survey data from a CSV located in the working directory.
df=pd.read_csv("Dhaka_people.csv")
# NOTE(review): the isnull() output further down shows the last row (index 1000)
# is missing every column except Gender — TODO confirm whether it is real data
# or a stray trailer line that should be dropped.
df.head()
| | Gender | Age | NS1 | IgG | IgM | Area | AreaType | HouseType | District | Outcome |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Female | 45.0 | 0.0 | 0.0 | 0.0 | Mirpur | Undeveloped | Building | Dhaka | 0.0 |
| 1 | Male | 17.0 | 0.0 | 0.0 | 1.0 | Chawkbazar | Developed | Building | Dhaka | 0.0 |
| 2 | Female | 29.0 | 0.0 | 0.0 | 0.0 | Paltan | Undeveloped | Other | Dhaka | 0.0 |
| 3 | Female | 63.0 | 1.0 | 1.0 | 0.0 | Motijheel | Developed | Other | Dhaka | 1.0 |
| 4 | Male | 22.0 | 0.0 | 0.0 | 0.0 | Gendaria | Undeveloped | Building | Dhaka | 0.0 |
from sklearn.preprocessing import LabelEncoder

# Replace the Gender strings with integer codes.
# The encoder is fitted exactly once; a second fit_transform whose result is
# thrown away only repeats the work.
led = LabelEncoder()
df['Gender'] = led.fit_transform(df['Gender'])
# In the preview, rows that were "Female" now show 0 and "Male" rows show 1.
df.head()
| | Gender | Age | NS1 | IgG | IgM | Area | AreaType | HouseType | District | Outcome |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 45.0 | 0.0 | 0.0 | 0.0 | Mirpur | Undeveloped | Building | Dhaka | 0.0 |
| 1 | 1 | 17.0 | 0.0 | 0.0 | 1.0 | Chawkbazar | Developed | Building | Dhaka | 0.0 |
| 2 | 0 | 29.0 | 0.0 | 0.0 | 0.0 | Paltan | Undeveloped | Other | Dhaka | 0.0 |
| 3 | 0 | 63.0 | 1.0 | 1.0 | 0.0 | Motijheel | Developed | Other | Dhaka | 1.0 |
| 4 | 1 | 22.0 | 0.0 | 0.0 | 0.0 | Gendaria | Undeveloped | Building | Dhaka | 0.0 |
# (rows, columns) of the loaded frame.
df.shape
(1001, 10)
# Cell-by-cell mask: True wherever a value is missing.
df.isnull()
| | Gender | Age | NS1 | IgG | IgM | Area | AreaType | HouseType | District | Outcome |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | False | False | False | False | False | False | False | False | False | False |
| 1 | False | False | False | False | False | False | False | False | False | False |
| 2 | False | False | False | False | False | False | False | False | False | False |
| 3 | False | False | False | False | False | False | False | False | False | False |
| 4 | False | False | False | False | False | False | False | False | False | False |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 996 | False | False | False | False | False | False | False | False | False | False |
| 997 | False | False | False | False | False | False | False | False | False | False |
| 998 | False | False | False | False | False | False | False | False | False | False |
| 999 | False | False | False | False | False | False | False | False | False | False |
| 1000 | False | True | True | True | True | True | True | True | True | True |
1001 rows × 10 columns
# Number of missing values in each column.
df.isnull().sum()
Gender 0 Age 1 NS1 1 IgG 1 IgM 1 Area 1 AreaType 1 HouseType 1 District 1 Outcome 1 dtype: int64
from sklearn.model_selection import train_test_split

# Hold out 70% of the rows as `test`; the fixed seed keeps the split repeatable.
# NOTE(review): 70% test / 30% train is an unusual ratio — confirm it is intended.
train, test = train_test_split(
    df,
    random_state=42,
    test_size=.70,
)
train.shape
(300, 10)
# (rows, columns) of the held-out portion.
test.shape
(701, 10)
# Preview the first held-out rows (the index shows the shuffled row labels).
test.head()
| | Gender | Age | NS1 | IgG | IgM | Area | AreaType | HouseType | District | Outcome |
|---|---|---|---|---|---|---|---|---|---|---|
| 521 | 1 | 23.0 | 1.0 | 1.0 | 0.0 | Kamrangirchar | Developed | Tinshed | Dhaka | 1.0 |
| 941 | 0 | 37.0 | 0.0 | 0.0 | 0.0 | Rampura | Developed | Building | Dhaka | 0.0 |
| 741 | 1 | 65.0 | 0.0 | 0.0 | 0.0 | Khilgaon | Developed | Tinshed | Dhaka | 0.0 |
| 980 | 1 | 11.0 | 0.0 | 0.0 | 0.0 | Banasree | Undeveloped | Other | Dhaka | 0.0 |
| 411 | 1 | 24.0 | 0.0 | 0.0 | 1.0 | Hazaribagh | Developed | Other | Dhaka | 0.0 |
# Write the held-out rows to disk.
test.to_csv('dhaka_testing.csv')

# Three independent working copies of the frame. df1 is label-encoded further
# down; df2 and df3 are not used in the visible part of the notebook.
df1, df2, df3 = df.copy(), df.copy(), df.copy()
df.head()
| | Gender | Age | NS1 | IgG | IgM | Area | AreaType | HouseType | District | Outcome |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 45.0 | 0.0 | 0.0 | 0.0 | Mirpur | Undeveloped | Building | Dhaka | 0.0 |
| 1 | 1 | 17.0 | 0.0 | 0.0 | 1.0 | Chawkbazar | Developed | Building | Dhaka | 0.0 |
| 2 | 0 | 29.0 | 0.0 | 0.0 | 0.0 | Paltan | Undeveloped | Other | Dhaka | 0.0 |
| 3 | 0 | 63.0 | 1.0 | 1.0 | 0.0 | Motijheel | Developed | Other | Dhaka | 1.0 |
| 4 | 1 | 22.0 | 0.0 | 0.0 | 0.0 | Gendaria | Undeveloped | Building | Dhaka | 0.0 |
# Row count per encoded Gender value.
df['Gender'].value_counts()
0 524 1 476 2 1 Name: Gender, dtype: int64
# Share of Female vs. Male among the rows carrying one of those two codes.
# The counts are read from the data instead of being typed in by hand, so the
# figures stay correct if the CSV changes.
# Gender is already label-encoded here: 0 = Female, 1 = Male (compare the raw
# and encoded previews earlier in the notebook).
gender_counts = df['Gender'].value_counts()
female_count = int(gender_counts.loc[0])
male_count = int(gender_counts.loc[1])
Female = (female_count / (female_count + male_count)) * 100
Male = (male_count / (female_count + male_count)) * 100
print('Female {} percent of total People '.format(Female))
print('Male {} percent of total People '.format(Male))
Female 52.400000000000006 percent of total People Male 47.599999999999994 percent of total People
# Bar chart of row counts per Gender code.
# The column is bound to `x` by keyword: a Series passed positionally is not
# treated as the x variable by current seaborn, which is why the recorded
# output has no x-axis label.
sns.countplot(x='Gender', data=df, color='#2B00FF')
<Axes: ylabel='count'>
# Horizontal bars: rows per Gender code, split by Age.
sns.countplot(data=df, y='Gender', hue='Age')
<Axes: xlabel='count', ylabel='Gender'>
# Rows per Age value, split by Gender code.
sns.countplot(data=df, x='Age', hue='Gender')
<Axes: xlabel='Age', ylabel='count'>
# Rows per Gender code, split by AreaType.
sns.countplot(data=df, x='Gender', hue='AreaType')
<Axes: xlabel='Gender', ylabel='count'>
# Horizontal bars: rows per Area, split by Gender code.
sns.countplot(data=df, y='Area', hue='Gender')
<Axes: xlabel='count', ylabel='Area'>
# Rows per HouseType, split by Gender code.
sns.countplot(data=df, x='HouseType', hue='Gender')
<Axes: xlabel='HouseType', ylabel='count'>
# Rows per Outcome value, split by Gender code.
sns.countplot(data=df, x='Outcome', hue='Gender')
<Axes: xlabel='Outcome', ylabel='count'>
# Rows per NS1 value, split by Gender code.
sns.countplot(data=df, x='NS1', hue='Gender')
<Axes: xlabel='NS1', ylabel='count'>
# Row count per AreaType category.
df['AreaType'].value_counts()
Developed 501 Undeveloped 499 Name: AreaType, dtype: int64
# Rows per HouseType, split by AreaType, drawn on a 10x6 inch figure.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='HouseType', hue='AreaType', ax=ax)
<Axes: xlabel='HouseType', ylabel='count'>
# Horizontal bars: rows per Area, split by AreaType, on a 10x6 inch figure.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, y='Area', hue='AreaType', ax=ax)
<Axes: xlabel='count', ylabel='Area'>
# Horizontal bars: rows per Area, split by HouseType, on a 10x6 inch figure.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, y='Area', hue='HouseType', ax=ax)
<Axes: xlabel='count', ylabel='Area'>
from sklearn.preprocessing import LabelEncoder
# Encoder instance that the column loop further down applies to df1.
label =LabelEncoder()
# Column labels of the working copy.
df1.columns
Index(['Gender', 'Age', 'NS1', 'IgG', 'IgM', 'Area', 'AreaType', 'HouseType',
'District', 'Outcome'],
dtype='object')
# Public home of is_numeric_dtype (pandas.core.* is internal API).
from pandas.api.types import is_numeric_dtype

# Label-encode every non-numeric column of df1 in place.
# `label` is re-fitted for each column, so the classes learned for a column
# are saved here before the next fit replaces them; encoded_classes[col][code]
# gives back the original value for a code.
encoded_classes = {}
for column in df1.columns:
    if is_numeric_dtype(df1[column]):
        continue
    df1[column] = label.fit_transform(df1[column])
    encoded_classes[column] = label.classes_
# NOTE(review): df.info() reports one missing value in each text column —
# confirm that the code assigned to that missing entry is acceptable.
df1.head()
| | Gender | Age | NS1 | IgG | IgM | Area | AreaType | HouseType | District | Outcome |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 45.0 | 0.0 | 0.0 | 0.0 | 22 | 1 | 0 | 0 | 0.0 |
| 1 | 1 | 17.0 | 0.0 | 0.0 | 1.0 | 7 | 0 | 0 | 0 | 0.0 |
| 2 | 0 | 29.0 | 0.0 | 0.0 | 0.0 | 27 | 1 | 1 | 0 | 0.0 |
| 3 | 0 | 63.0 | 1.0 | 1.0 | 0.0 | 24 | 0 | 1 | 0 | 1.0 |
| 4 | 1 | 22.0 | 0.0 | 0.0 | 0.0 | 10 | 1 | 0 | 0 | 0.0 |
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import warnings
# Suppress every warning from here on.
# NOTE(review): this also hides deprecation notices from the plotting and
# modelling libraries — consider narrowing the filter.
warnings.filterwarnings('ignore')
df.head()
| | Gender | Age | NS1 | IgG | IgM | Area | AreaType | HouseType | District | Outcome |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 45.0 | 0.0 | 0.0 | 0.0 | Mirpur | Undeveloped | Building | Dhaka | 0.0 |
| 1 | 1 | 17.0 | 0.0 | 0.0 | 1.0 | Chawkbazar | Developed | Building | Dhaka | 0.0 |
| 2 | 0 | 29.0 | 0.0 | 0.0 | 0.0 | Paltan | Undeveloped | Other | Dhaka | 0.0 |
| 3 | 0 | 63.0 | 1.0 | 1.0 | 0.0 | Motijheel | Developed | Other | Dhaka | 1.0 |
| 4 | 1 | 22.0 | 0.0 | 0.0 | 0.0 | Gendaria | Undeveloped | Building | Dhaka | 0.0 |
# Per-column dtype and non-null count summary.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1001 entries, 0 to 1000 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Gender 1001 non-null int32 1 Age 1000 non-null float64 2 NS1 1000 non-null float64 3 IgG 1000 non-null float64 4 IgM 1000 non-null float64 5 Area 1000 non-null object 6 AreaType 1000 non-null object 7 HouseType 1000 non-null object 8 District 1000 non-null object 9 Outcome 1000 non-null float64 dtypes: float64(5), int32(1), object(4) memory usage: 74.4+ KB
# Separate the Gender target from the remaining predictor columns.
y = df[['Gender']]  # double brackets keep the target as a one-column frame
x = df.drop(columns='Gender')
x.head()
| | Age | NS1 | IgG | IgM | Area | AreaType | HouseType | District | Outcome |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 45.0 | 0.0 | 0.0 | 0.0 | Mirpur | Undeveloped | Building | Dhaka | 0.0 |
| 1 | 17.0 | 0.0 | 0.0 | 1.0 | Chawkbazar | Developed | Building | Dhaka | 0.0 |
| 2 | 29.0 | 0.0 | 0.0 | 0.0 | Paltan | Undeveloped | Other | Dhaka | 0.0 |
| 3 | 63.0 | 1.0 | 1.0 | 0.0 | Motijheel | Developed | Other | Dhaka | 1.0 |
| 4 | 22.0 | 0.0 | 0.0 | 0.0 | Gendaria | Undeveloped | Building | Dhaka | 0.0 |
# Preview the target frame.
y.head()
| | Gender |
|---|---|
| 0 | 0 |
| 1 | 1 |
| 2 | 0 |
| 3 | 0 |
| 4 | 1 |
# Percentage of all rows per Gender code, rounded to one decimal place.
# The heading previously read "Gander"; the typo is corrected.
print('Gender in 100%')
round(df.Gender.value_counts()*100/len(df),1)
Gander in 100%
0 52.3 1 47.6 2 0.1 Name: Gender, dtype: float64
# Donut chart: number of rows for each Gender code.
Gender = df['Gender'].value_counts()
transctions, quantity = Gender.index, Gender.values
figure = px.pie(
    df,
    names=transctions,
    values=quantity,
    hole=.70,
    title=" Dhaka City People Gender (Female , Male or Other ) ",
)
figure.show()
# Donut chart: number of rows for each Age value.
Age = df['Age'].value_counts()
transctions, quantity = Age.index, Age.values
figure = px.pie(
    df,
    names=transctions,
    values=quantity,
    hole=.70,
    title=" Dhaka City People Age ",
)
figure.show()
# Donut chart: number of rows for each Area.
Area = df['Area'].value_counts()
transctions, quantity = Area.index, Area.values
figure = px.pie(
    df,
    names=transctions,
    values=quantity,
    hole=.70,
    title=" Area in Dhaka City ",
)
figure.show()
# Donut chart: number of rows for each HouseType.
HouseType = df['HouseType'].value_counts()
transctions, quantity = HouseType.index, HouseType.values
figure = px.pie(
    df,
    names=transctions,
    values=quantity,
    hole=.70,
    title=" Dhaka City People Live in HouseType ",
)
figure.show()
# Donut chart: number of rows for each AreaType.
AreaType = df['AreaType'].value_counts()
transctions, quantity = AreaType.index, AreaType.values
figure = px.pie(
    df,
    names=transctions,
    values=quantity,
    hole=.70,
    title=" Dhaka City People Live in AreaType (Developed or Undeveloped)",
)
figure.show()
import pandas as pd
from ydata_profiling import ProfileReport
# Build the profiling report for the predictor frame `x`; leaving the object as
# the cell's final expression is what makes the notebook render it.
profile = ProfileReport(x, title="Dhaka_City-information_Analysis_Report")
profile
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]